This script compares GSVA scores for hallmark and KEGG pathways between PBTA samples with and without DNA repair gene germline PLP variants
Load packages and set directories
library(tidyverse)
library(tibble)
library(ComplexHeatmap)
library(circlize)
Set filepaths
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
analysis_dir <- file.path(root_dir, "analyses", "CLK1-splicing-impact-morpholino")
results_dir <- file.path(analysis_dir, "results")
plots_dir <- file.path(analysis_dir, "plots")
source(file.path(root_dir, "figures", "theme_for_plots.R"))
Set file paths
gsva_file <- file.path(results_dir, "expr_collapsed_clk1_ctrl_morpho_hallmark_gsva_scores.tsv")
de_gsva_file <- file.path(results_dir, "expr_collapsed_de_clk1_ctrl_morpho_hallmark_gsva_scores.tsv")
kegg_file <- file.path(results_dir, "expr_collapsed_clk1_ctrl_morpho_kegg_gsva_scores.tsv")
de_kegg_file <- file.path(results_dir, "expr_collapsed_de_clk1_ctrl_morpho_kegg_gsva_scores.tsv")
dna_file <- file.path(results_dir, "expr_collapsed_clk1_ctrl_morpho_dna_repair_gsva_scores.tsv")
de_dna_file <- file.path(results_dir, "expr_collapsed_de_clk1_ctrl_morpho_dna_repair_gsva_scores.tsv")
hall_splice_file <- file.path(results_dir, "expr_splice_clk1_ctrl_morpho_hallmark_gsva_scores.tsv")
hall_splice_onco_file <- file.path(results_dir, "expr_splice_onco_clk1_ctrl_morpho_hallmark_gsva_scores.tsv")
kegg_splice_file <- file.path(results_dir, "expr_splice_clk1_ctrl_morpho_kegg_gsva_scores.tsv")
kegg_splice_onco_file <- file.path(results_dir, "expr_splice_onco_clk1_ctrl_morpho_kegg_gsva_scores.tsv")
dna_splice_file <- file.path(results_dir, "expr_splice_clk1_ctrl_morpho_dna_repair_gsva_scores.tsv")
dna_splice_onco_file <- file.path(results_dir, "expr_splice_onco_clk1_ctrl_morpho_dna_repair_gsva_scores.tsv")
# reduce kegg pathways
hallmark_file <- file.path(root_dir, "analyses", "sample-psi-clustering", "input", "genesets.tsv")
Create histology file for grouping samples
hall_to_keep <- read_tsv(hallmark_file) %>%
filter(Keep == "Yes")
## Rows: 51 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): Geneset, Keep
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sample_ids <- c("CTRL1", "CTRL2", "CTRL3", "Treated1", "Treated2", "Treated3")
treatments <- c("Non-targeting Morpholino",
"Non-targeting Morpholino",
"Non-targeting Morpholino",
"CLK1 Exon 4 Morpholino",
"CLK1 Exon 4 Morpholino",
"CLK1 Exon 4 Morpholino")
df <- tibble(sample_id = sample_ids, Treatment = treatments)
# read in scores and de results
hallmark_scores <- read_tsv(gsva_file) %>%
filter(geneset %in% hall_to_keep$Geneset)
## Rows: 300 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): sample_id, geneset
## dbl (1): gsva_score
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
hallmark_pathways <- unique(hallmark_scores$geneset)
kegg_scores <- read_tsv(kegg_file)
## Rows: 1116 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): sample_id, geneset
## dbl (1): gsva_score
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
kegg_pathways <- unique(kegg_scores$geneset)
dna_scores <- read_tsv(dna_file)
## Rows: 36 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): sample_id, geneset
## dbl (1): gsva_score
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dna_pathways <- unique(dna_scores$geneset)
hallmark_scores_de <- read_tsv(de_gsva_file) %>%
filter(geneset %in% hall_to_keep$Geneset)
## Rows: 300 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): sample_id, geneset
## dbl (1): gsva_score
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
hallmark_pathways_de <- unique(hallmark_scores_de$geneset)
kegg_scores_de <- read_tsv(de_kegg_file)
## Rows: 1116 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): sample_id, geneset
## dbl (1): gsva_score
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
kegg_pathways_de <- unique(kegg_scores_de$geneset)
dna_scores_de <- read_tsv(de_dna_file)
## Rows: 36 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): sample_id, geneset
## dbl (1): gsva_score
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dna_pathways_de <- unique(dna_scores_de$geneset)
# splicing
hallmark_scores_sp <- read_tsv(hall_splice_file) %>%
filter(geneset %in% hall_to_keep$Geneset)
## Rows: 276 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): sample_id, geneset
## dbl (1): gsva_score
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
hallmark_pathways_sp <- unique(hallmark_scores_sp$geneset)
kegg_scores_sp <- read_tsv(kegg_splice_file)
## Rows: 660 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): sample_id, geneset
## dbl (1): gsva_score
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
kegg_pathways_sp <- unique(kegg_scores_sp$geneset)
dna_scores_sp <- read_tsv(dna_splice_file)
## Rows: 36 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): sample_id, geneset
## dbl (1): gsva_score
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dna_pathways_sp <- unique(dna_scores_sp$geneset)
hallmark_scores_sp_onc <- read_tsv(hall_splice_onco_file) %>%
filter(geneset %in% hall_to_keep$Geneset)
## Rows: 276 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): sample_id, geneset
## dbl (1): gsva_score
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
hallmark_pathways_sp_onc <- unique(hallmark_scores_sp_onc$geneset)
kegg_scores_sp_onc <- read_tsv(kegg_splice_onco_file)
## Rows: 660 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): sample_id, geneset
## dbl (1): gsva_score
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
kegg_pathways_sp_onc <- unique(kegg_scores_sp_onc$geneset)
dna_scores_sp_onc <- read_tsv(dna_splice_onco_file)
## Rows: 36 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): sample_id, geneset
## dbl (1): gsva_score
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dna_pathways_sp_onc <- unique(dna_scores_sp_onc$geneset)
Run GSVA score comparisons between samples with and without CLK1 morpholino
# Define repair gene source input directories
treat_ids <- df %>%
filter(Treatment == "CLK1 Exon 4 Morpholino") %>%
pull(sample_id) %>%
unique()
ctrl_ids <- df %>%
filter(Treatment == "Non-targeting Morpholino") %>%
pull(sample_id) %>%
unique()
# Lists of score data frames and pathway lists
score_sources <- list(kegg_scores, hallmark_scores, dna_scores, kegg_scores_de, hallmark_scores_de, dna_scores_de,
kegg_scores_sp, hallmark_scores_sp, dna_scores_sp, kegg_scores_sp_onc, hallmark_scores_sp_onc, dna_scores_sp_onc)
pathway_sources <- list(kegg_pathways, hallmark_pathways, dna_pathways, kegg_pathways_de, hallmark_pathways_de, dna_pathways_de,
kegg_pathways_sp, hallmark_pathways_sp, dna_pathways_sp, kegg_pathways_sp_onc, hallmark_pathways_sp_onc, dna_pathways_sp_onc)
score_names <- c("kegg", "hallmark", "dna_repair", "kegg_de", "hallmark_de", "dna_repair_de",
"kegg_sp", "hallmark_sp", "dna_repair_sp", "kegg_sp_onc", "hallmark_sp_onc", "dna_repair_sp_onc") # Names for output files
for (i in 1:length(score_sources)) {
score_df <- score_sources[[i]]
pathway_list <- pathway_sources[[i]]
score_name <- score_names[i]
gsva_res <- data.frame(pathway = pathway_list,
treat_score = rep(0, length(pathway_list)),
ctrl_score = rep(0, length(pathway_list)),
score_diff = rep(0, length(pathway_list)),
wilcox_stat = rep(0, length(pathway_list)),
pvalue = rep(0, length(pathway_list)))
for (k in 1:length(pathway_list)) {
treat_scores <- score_df[score_df$geneset == pathway_list[k] & score_df$sample_id %in% treat_ids, ]$gsva_score
ctrl_scores <- score_df[score_df$geneset == pathway_list[k] & score_df$sample_id %in% ctrl_ids, ]$gsva_score
# Ensure treat_scores and ctrl_scores are numeric vectors
treat_scores <- as.numeric(treat_scores)
ctrl_scores <- as.numeric(ctrl_scores)
# Proceed with mean calculation and Wilcoxon test as before
gsva_res$treat_score[k] <- mean(treat_scores, na.rm = TRUE)
gsva_res$ctrl_score[k] <- mean(ctrl_scores, na.rm = TRUE)
gsva_res$score_diff[k] <- gsva_res$treat_score[k] - gsva_res$ctrl_score[k]
if (length(treat_scores) > 0 && length(ctrl_scores) > 0) {
test_result <- tryCatch({
wilcox.test(treat_scores, ctrl_scores, alternative = 'two.sided')
}, warning = function(w) {
return(list(statistic = NA, p.value = NA))
}, error = function(e) {
return(list(statistic = NA, p.value = NA))
})
gsva_res$wilcox_stat[k] <- test_result$statistic
gsva_res$pvalue[k] <- test_result$p.value
} else {
gsva_res$wilcox_stat[k] <- NA
gsva_res$pvalue[k] <- NA
}
}
# Write to output
output_filename <- paste0("gsva_score_diff_", score_name, ".tsv")
write_tsv(gsva_res, file.path(results_dir, output_filename))
}
Plot single samples
# create anno colors
top_annot <- df %>%
dplyr::select(Treatment) %>%
as.data.frame()
anno_col <- list(Treatment = c("Non-targeting Morpholino" = "#FFC20A", "CLK1 Exon 4 Morpholino" = "#0C7BDC"))
# Heatmap annotation
top_anno = HeatmapAnnotation(df = top_annot,
col = anno_col, show_legend = TRUE)
# plot
for (i in 1:length(score_sources)) {
score_df <- score_sources[[i]] %>%
mutate(geneset = gsub("HALLMARK_|KEGG_", "", geneset),
geneset = gsub("_", " ", geneset))
score_df_summarized <- score_df %>%
mutate(Treatment = ifelse(grepl("CTRL", sample_id), "Non-targeting Morpholino", "CLK1 Exon 4 Morpholino")) %>%
group_by(geneset, Treatment) %>%
summarise(Mean_GSVA_Score = mean(gsva_score, na.rm = TRUE), .groups = 'drop')
# Re-order
ordering_metric <- score_df_summarized %>%
filter(Treatment == "CLK1 Exon 4 Morpholino") %>%
arrange(desc(Mean_GSVA_Score)) %>%
pull(geneset)
# Apply this ordering to the 'geneset' factor
score_df_summarized$geneset <- factor(score_df_summarized$geneset, levels = ordering_metric)
gsva_barplot <- ggplot(score_df_summarized, aes(x = geneset, y = Mean_GSVA_Score, fill = Treatment)) +
geom_bar(stat = "identity", position = position_dodge()) +
theme_Publication() +
labs(x = "Geneset", y = "Mean GSVA score") +
scale_fill_manual(values = c("Non-targeting Morpholino" = "#FFC20A", "CLK1 Exon 4 Morpholino" = "#0C7BDC")) +
coord_flip() +
theme(legend.position = "top") # Positions the legend at the top
score_name <- score_names[i]
gsva_scores_mat <- score_df %>%
spread(., sample_id, gsva_score) %>%
tibble::column_to_rownames("geneset") %>%
as.matrix()
if (i %in% c(1,4,7,10)){
height <- 30
width <- 12
cell_width <- 3
}
else if (i %in% c(2,5,8,11)){
height <- 10
width <- 8
cell_width <- 3
}
else if (i %in% c(3,6,9,12)){
height <- 3.5
width <- 5.5
cell_width <- 3
}
heatmap_obj <- Heatmap(
gsva_scores_mat,
col = colorRamp2(c(-1, 0, 1), c("blue", "white", "darkorange")),
name = "GSVA Scores",
na_col = "whitesmoke",
width = unit(cell_width, "cm"),
show_column_names = FALSE,
cluster_columns = FALSE,
row_names_gp = grid::gpar(fontsize = 12),
top_annotation = top_anno,
heatmap_legend_param = list(direction = "horizontal"),
)
gsva_heatmap <- draw(heatmap_obj, annotation_legend_side = "top", heatmap_legend_side = "top")
output_filename <- paste0("gsva_heatmap_", score_name, ".pdf")
pdf(file.path(plots_dir, output_filename), height = height, width = width)
print(gsva_heatmap)
print(gsva_barplot)
dev.off()
}
Print session info
sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Etc/UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] ggthemes_5.1.0 circlize_0.4.16 ComplexHeatmap_2.20.0
## [4] lubridate_1.9.4 forcats_1.0.0 stringr_1.5.1
## [7] dplyr_1.1.4 purrr_1.0.4 readr_2.1.5
## [10] tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.1
## [13] tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 shape_1.4.6.1 rjson_0.2.21
## [4] xfun_0.50 bslib_0.9.0 GlobalOptions_0.1.2
## [7] tzdb_0.4.0 vctrs_0.6.5 tools_4.4.0
## [10] generics_0.1.3 stats4_4.4.0 parallel_4.4.0
## [13] cluster_2.1.8 pkgconfig_2.0.3 RColorBrewer_1.1-3
## [16] S4Vectors_0.42.1 lifecycle_1.0.4 farver_2.1.2
## [19] compiler_4.4.0 munsell_0.5.1 codetools_0.2-20
## [22] clue_0.3-65 htmltools_0.5.8.1 sass_0.4.9
## [25] yaml_2.3.10 pillar_1.10.1 crayon_1.5.3
## [28] jquerylib_0.1.4 cachem_1.1.0 magick_2.8.3
## [31] iterators_1.0.14 foreach_1.5.2 tidyselect_1.2.1
## [34] digest_0.6.37 stringi_1.8.4 labeling_0.4.3
## [37] rprojroot_2.0.4 fastmap_1.2.0 colorspace_2.1-1
## [40] cli_3.6.4 magrittr_2.0.3 withr_3.0.2
## [43] scales_1.3.0 bit64_4.6.0-1 timechange_0.3.0
## [46] rmarkdown_2.29 matrixStats_1.3.0 bit_4.5.0.1
## [49] png_0.1-8 GetoptLong_1.0.5 hms_1.1.3
## [52] evaluate_1.0.3 knitr_1.49 IRanges_2.38.1
## [55] doParallel_1.0.17 rlang_1.1.5 Rcpp_1.0.14
## [58] glue_1.8.0 BiocGenerics_0.50.0 vroom_1.6.5
## [61] jsonlite_1.8.9 R6_2.6.1